packages

# install.packages ('gapminder')
library(gapminder)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

shortcuts

alt+- will add <-
shift+ctrl+c to add # in front of a line (comment/uncomment)
end a comment line with ‘----’ (four dashes) to make a section header, so it is easy to navigate through the script
command+shift+m (ctrl+shift+m on Windows/Linux) will add the pipe %>%
ctrl+alt+i for new code chunk # syntax Plain text
end a line with two spaces to start a new paragraph.
*italics* and _italics_
**bold** and __bold__
superscript^2^
~~strikethrough~~
link to rstudio

logical operations

1==1 # equality: is 1 equal to 1?
1!=3 # inequality: is 1 different from 3?
13<14 # is 13 smaller than 14?
14>13 # is 14 bigger than 13?
12>=0 # is 12 greater than or equal to zero?
12<=3 # is 12 smaller than or equal to 3? (FALSE)

creating data.frame

family

name <- c('saneesh', 'sanusha', 'appu', 'kishan')
weight <- c(63,48, 20, NA)
height <- c(164, 150, NA, 75)
family <- data.frame(name, weight, height)
family %>% as_tibble()

data frame with unequal group sizes (10 and 8)

library(tidyverse)
data <- data.frame(sex=c(rep('female', 10), rep('male', 8)), 
                   score=c(rnorm(n= 10, mean = 7.56, sd = 1.978), rnorm(n= 8, mean=7.75, sd= 1.631)))

data
data %>% group_by(sex) %>% 
  summarise(score= n()) %>% 
  mutate(freq=score/sum(score)*100)

tibble

library(tidyverse)
years <- tribble(
  ~Location, ~Year, ~Month, ~Day, ~Lenght,
  "Sydney", 2000, 9, 15,12.1213,
  "Athens", 2004, 8, 13, 12.1212,
  "Beijing", 2008, 8, 8,13.212,
  "London", 2012, 7, 27,13.1212,
  "Rio de Janeiro", 2016, 8, 5,65.00
)

tabyl

tabyl

table

mutate round

# run previous code chunk
library(gt)
years %>% gt()
Location Year Month Day Lenght
Sydney 2000 9 15 12.1213
Athens 2004 8 13 12.1212
Beijing 2008 8 8 13.2120
London 2012 7 27 13.1212
Rio de Janeiro 2016 8 5 65.0000
years %>% 
  mutate(Lenght= round(Lenght, 2)) %>% 
  gt() %>% 
  tab_options(column_labels.font.size = 11,
              column_labels.font.weight = "bold",
              table.font.size = 10,
              ) %>% 
  opt_table_outline(style = "solid", width = px(2))
Location Year Month Day Lenght
Sydney 2000 9 15 12.12
Athens 2004 8 13 12.12
Beijing 2008 8 8 13.21
London 2012 7 27 13.12
Rio de Janeiro 2016 8 5 65.00
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# HairEyeColor is a 3-way frequency table; data.frame() turns it into one row
# per Hair/Eye/Sex combination with the number of people stored in `Freq`.
data <- data.frame(HairEyeColor)

# tabyl() counts ROWS, so each row must first be repeated `Freq` times.
# Without uncount() every Hair/Eye cell is just 2 (one Male row + one Female
# row), which gives a meaningless 25% everywhere.
data %>%
  tidyr::uncount(Freq) %>%
  tabyl(Hair, Eye) %>% 
  adorn_percentages('row') %>% 
  adorn_pct_formatting(digits = 2) %>% 
  adorn_ns() %>% 
  knitr::kable()
Hair Brown Blue Hazel Green
Black 62.96% (68) 18.52% (20) 13.89% (15) 4.63% (5)
Brown 41.61% (119) 29.37% (84) 18.88% (54) 10.14% (29)
Red 36.62% (26) 23.94% (17) 19.72% (14) 19.72% (14)
Blond 5.51% (7) 74.02% (94) 7.87% (10) 12.60% (16)

is.na

# identify location of NAs in vector
which(is.na(family))
## [1]  8 11
colSums(is.na(family)) 
##   name weight height 
##      0      1      1

replace na

mat <- matrix(sample(c(NA, 1:5), 50, replace = TRUE), 5)
df <- as.data.frame(mat)
df %>% replace(is.na(.), 0)%>% view()

drop na

see spread & gather # clean names

# install.packages('janitor')
library(janitor)

id <- (c(1,1,2,2,3,3))
Country <- c('Angola', 'Angola','Botswana', 'Botswana','Zimbabwe','Zimbabwe')
year <- c('2006', '2007', '2008', '2009', '2010', '2006')
bank.ratio <- c(24,25,38,34,42,49)
Reserve.ratio <- c(77,59,64,65,57,86)
broad.money <- c(163,188,317,361,150,288)


bank <- data.frame(id, Country, year, bank.ratio, Reserve.ratio,broad.money)

# Print the data frame as a tibble. The pipe must continue onto as_tibble();
# a bare `as_tibble()` on its own line is called with no input and only
# produces a "`x` argument can't be missing" warning.
bank %>%
  as_tibble()
bank <- bank %>% clean_names() # replaced . with _
  
glimpse(bank)  
## Rows: 6
## Columns: 6
## $ id            <dbl> 1, 1, 2, 2, 3, 3
## $ country       <chr> "Angola", "Angola", "Botswana", "Botswana", "Zimbabwe", …
## $ year          <chr> "2006", "2007", "2008", "2009", "2010", "2006"
## $ bank_ratio    <dbl> 24, 25, 38, 34, 42, 49
## $ reserve_ratio <dbl> 77, 59, 64, 65, 57, 86
## $ broad_money   <dbl> 163, 188, 317, 361, 150, 288
bank <- bank %>% clean_names() # replaced . with _

filter bank data frame below such that it retains a country if a given id is satisfied e.g. filtering a data frame that has countries with id 1 and 2 only

bank %>% 
  filter(id%in% c(1,2)) %>% 
  as_tibble()

summarise the funds available for each country

bank %>% 
  group_by(country) %>% 
  summarise(fund=sum(broad_money)) %>% 
  as_tibble()

rename column

column: new name= old name

iris %>% 
  rename(S.len=Sepal.Length,
         Sp.= Species) %>% head(3)

rename to lower

iris %>% 
  rename_with(tolower) %>% head(3)

rename to lower specific columns

# Lower-case only the names of Species and Petal.Length; all other columns are
# kept unchanged. (The superseded select_at() also dropped every other column.)
iris %>%
  rename_with(tolower, .cols = c(Species, Petal.Length)) %>%
  head(3)

add name to a nameless column

library(tidyverse)
mtcars <- mtcars %>% as_tibble(rownames="cars")

add column

library(tibble)
iris %>% add_column(ob_no=1:150) %>% head(5)
iris %>% as_tibble() %>% head(3)
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

recode observation

change the name of an observation: mutate(variable = recode(variable, ‘old name’ = ‘new name’))

gapminder %>% 
 mutate(country=recode(country, 'India'='IND' )) %>% 
  filter(country=='IND') %>% head(3)

select

gapminder %>% 
  select(year, country, gdpPercap)  %>% head(3)
msleep %>% select(starts_with("sleep")) %>% head(3)

do not select

iris %>% select(-Sepal.Length, -Species) %>% head(3)

or

iris %>% select(-c(Sepal.Length)) %>% head(3)
iris %>% select(!Sepal.Length) %>% head(3)

ends_with

iris %>% select(ends_with('length')) %>% head(3)

starts_with

iris %>% select(starts_with('Sepal')) %>% head(3)

filter

# keep three columns, then only the rows for Eritrea with year after 1950;
# head(3) shows the first 3 matching rows
gapminder %>% 
  select(year, country, lifeExp) %>% 
  filter(country=="Eritrea", year>1950)  %>% head(3)
gapminder %>% filter(country=="Canada") %>% head(3) # from gapminder data filter country Canada and show only the first 3 observations

except

gapminder %>% filter(country!="Oman") %>% head(3) # from gapminder data filter all the other countries except Oman

omit

iris %>% filter(Species!='setosa') %>% glimpse()
## Rows: 100
## Columns: 5
## $ Sepal.Length <dbl> 7.0, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5.0, 5.…
## $ Sepal.Width  <dbl> 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.…
## $ Petal.Length <dbl> 4.7, 4.5, 4.9, 4.0, 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.…
## $ Petal.Width  <dbl> 1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1.0, 1.3, 1.4, 1.0, 1.…
## $ Species      <fct> versicolor, versicolor, versicolor, versicolor, versicolo…

omit multiple

iris %>% filter(!Species %in% c('setosa', 'versicolor')) %>% glimpse()
## Rows: 50
## Columns: 5
## $ Sepal.Length <dbl> 6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.…
## $ Sepal.Width  <dbl> 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.…
## $ Petal.Length <dbl> 6.0, 5.1, 5.9, 5.6, 5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.…
## $ Petal.Width  <dbl> 2.5, 1.9, 2.1, 1.8, 2.2, 2.1, 1.7, 1.8, 1.8, 2.5, 2.0, 1.…
## $ Species      <fct> virginica, virginica, virginica, virginica, virginica, vi…

filter between

iris %>% filter(Petal.Width >=2 & Petal.Width <= 5) %>% glimpse()
## Rows: 29
## Columns: 5
## $ Sepal.Length <dbl> 6.3, 7.1, 6.5, 7.6, 7.2, 6.5, 6.8, 5.7, 5.8, 6.4, 7.7, 7.…
## $ Sepal.Width  <dbl> 3.3, 3.0, 3.0, 3.0, 3.6, 3.2, 3.0, 2.5, 2.8, 3.2, 3.8, 2.…
## $ Petal.Length <dbl> 6.0, 5.9, 5.8, 6.6, 6.1, 5.1, 5.5, 5.0, 5.1, 5.3, 6.7, 6.…
## $ Petal.Width  <dbl> 2.5, 2.1, 2.2, 2.1, 2.5, 2.0, 2.1, 2.0, 2.4, 2.3, 2.2, 2.…
## $ Species      <fct> virginica, virginica, virginica, virginica, virginica, vi…

filter matching

library(tidyverse)
# Start from the pristine built-in dataset and store the result under a new
# name instead of overwriting `mtcars`. If `mtcars` was already converted to a
# tibble earlier in the session its row names are gone, and the `rowname`
# column would then hold "1", "2", ... so no row could match 'Merc'.
cars_named <- datasets::mtcars %>% rownames_to_column()
cars_named %>%
  filter(str_detect(rowname, 'Merc')) %>% head(3) # keep only car names containing 'Merc'
cars_named %>%
  filter(!str_detect(rowname, 'Merc')) %>% head(3) # keep everything except 'Merc'

pull

iris %>% pull(Species) %>% head(3) # returns vector values
## [1] setosa setosa setosa
## Levels: setosa versicolor virginica
iris %>% select(Species) %>% head (3) # returns a table with one column
iris %>% select(everything()) %>% head(3)

multiple conditions

# all conditions joined with & must hold: Oman, after 1980, up to and including 2000
gapminder %>% 
  filter(country=="Oman" &
           year>1980 &
           year<=2000) %>% head(4)
# comma-separated conditions are combined with AND; the | alternatives are OR:
# year from 1980 onwards AND country is India, Oman or Canada
gapminder %>% 
  select(country, year) %>% 
  filter(year>=1980, country=="India"|
           country=="Oman"|
           country=="Canada") %>% head(4)
gapminder %>% filter(country!="Oman") %>% head(3) # from gapminder data keep all countries except Oman

filter multiple values using %in%

gapminder %>% filter(country %in% c('Hungary','Iceland', 'Mongolia')) %>% head(3)
target <- c('Hungary','Iceland', 'Mongolia')
gapminder %>% filter(country %in% target) %>% head (3)
friends <- data.frame(Names=c('Saneesh', 'Appu', 'Shruti', 'Aradhana', 'Arathi', 'James Bond'),
                      age=c(40,9, 25, 25, 25, 50))
# the data frame is `friends`
# its columns are Names and age
# column Names holds 'Saneesh', 'Appu', 'Shruti', 'Aradhana', 'Arathi', 'James Bond'
# We want to keep only the rows for Appu and James Bond, so we create a vector
# with these two names in it.

target <- c('Appu', 'James Bond') # and then filter with %in%

friends %>% filter(Names %in% target)
# or spell out each comparison and join them with |
friends %>% filter(Names== 'Appu'| Names== 'James Bond')
# or pass the vector of names inline
friends %>% filter(Names %in% c('Appu', 'James Bond'))

drop

gapminder %>% 
  select(-year,-pop) %>% 
  head(5)

group by & summarise

# Countries with the highest life expectancy in 2007.
# arrange() has no `decreasing` argument: `decreasing = TRUE` is silently
# treated as an extra (constant) sort key, so the result stays ascending.
# Descending order is requested with desc().
gapminder %>% 
  filter(year==2007) %>% 
  group_by(country) %>% 
  summarise(meanLE=mean(lifeExp)) %>% 
  arrange(desc(meanLE)) %>% head(3)
# Countries with the lowest minimum life expectancy (ascending is the default).
gapminder %>% 
  group_by(country) %>% 
  summarise(minLE=min(lifeExp)) %>% 
  arrange(minLE)  %>% head(3)

Group by continent, then summarise two things: first n = n(), the number of rows for each continent (i.e. the size of each group), then the mean of the lifeExp variable.

gapminder %>% 
  group_by(continent) %>% 
  summarise(n=n(),
            meanLife=mean(lifeExp))
gapminder %>% 
  group_by(continent) %>% 
  summarise(PopConti=sum(pop))
pets <- data.frame(names=c(rep('saneesh', 3), rep('appu', 2), 'sanusha'), 
                   pet=c(rep('dog', 3), rep('cat', 2), 'tiger'), number=c(2,2,5,7,8,1), 
                   size=c(rep('medium', 2), rep('small', 3), 'big'))

pets
library(tidyverse)

pets %>% group_by(pet, size) %>% 
  summarise(totalpet= sum(number))
## `summarise()` has grouped output by 'pet'. You can override using the `.groups`
## argument.

summarise

library(tidyverse)
plot <- c(rep(1,2), rep(2,4), rep(3,3))
bird <- c('a','b', 'a','b', 'c', 'd', 'a', 'b', 'c')
area <- c(rep(10,2), rep(5,4), rep(15,3))

birdlist <- data.frame(plot,bird,area)
birdlist
# summarize the following data frame to a summary table.
# option 1
birdlist %>% 
  group_by(plot) %>% 
  summarise(bird = n(), area = unique(area))
# option 2
birdlist %>%
  count(plot, area, name = "bird")
gapminder %>% 
  summarise(mean(lifeExp))
gapminder %>%
  summarise(range(lifeExp))
gapminder %>% 
  filter(country=="India") %>% 
  group_by(country) %>% 
  summarise(GDPmax=max(gdpPercap),
            GDPmin=min(gdpPercap),
            GDPmean=mean(gdpPercap))

count/summarize

count name column

iris %>% count(Species, name = 'how many')
mtcars %>% 
  count(am, name = 'number') %>% 
  as_tibble()
mtcars %>% 
  count(gear, name = 'no. gear')
library(tidyverse)
plot <- c(rep(1,2), rep(2,4), rep(3,3))
bird <- as.factor(c('a','b', 'a','b', 'c', 'd', 'a', 'b', 'c'))
area <- c(rep(10,2), rep(5,4), rep(15,3))

birdlist <- data.frame(plot,bird,area)
birdlist
#birdlist %>%    group_by(plot, area) %>%    mutate(count(bird))


birdlist %>%
  group_by(plot, area) %>%
  dplyr::summarize(bird = n(), # when summarize doesn't work directly use it (dplyr::)like this
            .groups = "drop") # to summarize of a column with reference to two other variables. 

count sites

treatment <- c(rep('ab',2), rep('bgrnf', 8), rep('bgpnf', 4))
site <- c('ab1', 'ab2', 
          rep('bgrnf1', 3), 
          rep('bgrnf2', 2), 
          'bgrnf3', 
          'bgrnf4', 
          'bgrnf5',
          rep('bgpnf1', 2),
          rep('bgpnf2', 2))
data <- data.frame(treatment, site)
library(tidyverse)
# Number of distinct sites per treatment.
# count() would return the number of ROWS per treatment (2, 4, 8); several
# sites appear on more than one row, so count unique site names instead.
data %>%
  group_by(treatment) %>%
  summarise(`#sites` = n_distinct(site))

case when new column

library(dplyr)
library(stringr)
feedback <- c('good_book', 'good_read', 'good_story', 'good for knowledge')
book <- c('ramayana', 'bible', 'encyclopedia', 'Mbharatha')

df <- data.frame(feedback, book)

df %>%
  mutate(response = case_when(str_starts(feedback, 'good') ~ 'good')) %>% 
  select(book, response) %>% as_tibble()

separate

text to columns

df <- data.frame(films = c("Spider_man", "James_bond", "Iron_man", "Bat_man"))
df
df1 <- df %>% 
  separate(films, c("a", "b"), sep='([_])')
df1

unite

df1 %>% unite("names", a:b, remove=FALSE)

join

df1 <- data.frame(id=c(1:4) ,films = c("Spider_man", "James_bond", "Iron_man", "Bat_man"))

df2 <- data.frame(id=c(1:4) ,country= rep("us", 4))
df3 <- left_join(df1, df2, by="id")

spread & gather

We are making a wide format from long format in the first example. The second example is to make a long format from wide.

# the following is already in long format
# Long-format example data: one row per student with their subject and grade.
classdata <- data.frame(
  studentname = c('captian', 'ant', 'james', 'spider', 'tony', 'bat', 'wonder'),
  subject = c('math', 'his', 'math', 'geo', 'his', 'geo', 'math'),
  grade = c('A+', 'B', 'B', 'A+', 'C', 'B+', 'C'))

classdata %>% head()

# long -> wide: pivot_wider() is the maintained successor of the superseded spread()
wide.class <- pivot_wider(classdata,          # name of the data frame
                          names_from = subject, # new columns to be made
                          values_from = grade)  # values to go into the new columns
head(wide.class)

# wide -> long: pivot_longer() is the maintained successor of gather()
pivot_longer(wide.class,                  # name of the data frame
             cols = c(geo, his, math),    # columns the values are gathered from
             names_to = "subject",        # column that receives the old column names
             values_to = "grade",         # column that receives the values
             values_drop_na = TRUE)       # drop the empty cells created by widening

join rows

df1 <- data.frame(id=c(1:4) ,films = c("Spider_man", "James_bond", "Iron_man", "Bat_man"))
df2<- data.frame(id=c(5:8) ,films = c("King Cong", "Silence of the lambs", "Intersteller", "Gravity"))
dplyr::bind_rows(df1, df2)

across

for multiple variables

library(tidyverse)
srno <- c(1:2)
film <- c("arabica", "robust")
rate <- c("good", "better")
lang_Eng <- c("yes", "yes")

films <- data.frame(srno, film, rate, lang_Eng)

str(films)
## 'data.frame':    2 obs. of  4 variables:
##  $ srno    : int  1 2
##  $ film    : chr  "arabica" "robust"
##  $ rate    : chr  "good" "better"
##  $ lang_Eng: chr  "yes" "yes"
films <- films %>% 
  mutate(across(c(rate, lang_Eng), as.factor))

str(films)
## 'data.frame':    2 obs. of  4 variables:
##  $ srno    : int  1 2
##  $ film    : chr  "arabica" "robust"
##  $ rate    : Factor w/ 2 levels "better","good": 2 1
##  $ lang_Eng: Factor w/ 1 level "yes": 1 1

everything

select a key variable first, followed by every other column.

library(gapminder)
gapminder %>% select(pop, everything()) %>% head (3)

toupper

tolower

library(stringr)

data <- data.frame(Dose.Cm=c("d1", "D2", "D3"),
                Len.km=c("High", 'low', 'Low'))
glimpse(data)
## Rows: 3
## Columns: 2
## $ Dose.Cm <chr> "d1", "D2", "D3"
## $ Len.km  <chr> "High", "low", "Low"
data %>% mutate(Dose.Cm= tolower(Dose.Cm), Len.km=toupper(Len.km))

factor

data <- data.frame(Dose.Cm=c("d1", "D2", "D3"),
                Len.km=c("high", 'low', 'medium'))
data <- data %>% mutate(len= as.factor(Len.km))

glimpse(data)
## Rows: 3
## Columns: 3
## $ Dose.Cm <chr> "d1", "D2", "D3"
## $ Len.km  <chr> "high", "low", "medium"
## $ len     <fct> high, low, medium

change order of factor

data %>% mutate(len= fct_relevel(len, c('low', 'medium', 'high')))

parse_number

This drops any non-numeric characters before or after the first number. The grouping mark specified by the locale is ignored inside the number.

library(tidyverse)
# small example: class labels and a binned count of students per class
class <- c('8th', '9th', '10th')
students <- c('25-30', '35-41', '21-28')
school <- data.frame(class, students)
school
glimpse(school) # notice that students is a binned (character) variable, not a numeric one
## Rows: 3
## Columns: 2
## $ class    <chr> "8th", "9th", "10th"
## $ students <chr> "25-30", "35-41", "21-28"
school %>% mutate(students= parse_number(students)) %>% glimpse()
## Rows: 3
## Columns: 2
## $ class    <chr> "8th", "9th", "10th"
## $ students <dbl> 25, 35, 21
school %>% mutate(students= parse_number(students))
# now students becomes a number, taken from the first number in each value of the column

pivot longer

library(tidyverse)

rawdata <- data.frame(species_1=rnorm(n = 40, mean = 300, sd = 18.5), species_2=rnorm(40, 305, 16.7))


data <- pivot_longer(data = rawdata, cols = species_1:species_2, names_to = 'species', values_to = 'weight')

ggplot

#sthda.com/english/wiki/ggplot2-barplots-quick-start-guide-r-software-and-data-visualization
df <- data.frame(dose=c("D0.5", "D1", "D2"),
                len=c(4.2, 10, 29.5))

bar plot

library(ggplot2)
# Basic barplot
p<-ggplot(data=df, aes(x=dose, y=len)) +
  geom_bar(stat="identity")
p

# Horizontal bar plot
# p + coord_flip()
# Change the width of bars
ggplot(data=df, aes(x=dose, y=len)) +
  geom_bar(stat="identity", width=0.5)

# Change colors
ggplot(data=df, aes(x=dose, y=len)) +
  geom_bar(stat="identity", color="blue", fill="white")

# Minimal theme + blue fill color
p<-ggplot(data=df, aes(x=dose, y=len)) +
  geom_bar(stat="identity", fill="steelblue")+
  theme_minimal()
p

geom_vline

df <- data.frame(dose=c("D0.5", "D1", "D2", 'pp', 'kk', 'rr'),
                len=c(4.2, 10, 29.5, 12, 15, 23))
library(ggplot2)

ggplot(df, aes(len))+
  geom_density()+
  geom_vline(aes(xintercept = mean(len)), col='red', linetype= 'dashed')

scatter plot with lm

library(ggplot2)

ggplot(iris, aes(Petal.Length, Petal.Width))+
  geom_point()+
  geom_smooth(method = 'lm')
## `geom_smooth()` using formula 'y ~ x'

## raincloud plot

library(ggdist)
library(tidyverse)
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Raincloud plot of highway mileage by number of cylinders:
# half-violin (density) + narrow boxplot + dot plot, flipped to be horizontal.
mpg %>% filter(cyl %in% c(4,6,8)) %>% 
  ggplot(aes( x= factor(cyl), y= hwy, fill=factor(cyl)))+
  # add half violin from `ggdist` package
  ggdist::stat_halfeye(
    # custom bandwidth
    adjust = 0.5,
    # move geom to right
    justification= -0.2,
    # remove slab interval
    .width = 0,
    point_color= NA
  )+
  # add boxplot
  geom_boxplot(
    width= 0.12,
    # remove outliers 
    outlier.colour = NA,
    alpha= 0.5
  )+
  # add dot plots from `ggdist` package
  ggdist::stat_dots(
    # orientation of the plot
    side= 'left',
    # move geom to the left
    justification= 1.1,
    # adjust grouping of observation
    binwidth=0.25
  )+
  # adjust theme
  scale_fill_tq()+
  theme_tq()+
  # labs() labels the AESTHETICS, not the screen axes: x is factor(cyl) and
  # y is hwy, even though coord_flip() draws x vertically afterwards.
  labs(
    title= 'raincloud plot',
    subtitle= 'showing bimodel distribution of 6 cylinder  vehicles',
    x= 'cylinders',
    y= 'highway fuel efficiency'
  )+
  coord_flip()

hex plot

library(tidyverse)
# install.packages("hexbin")
class <- c(rep('10th', 8))
students <- c('10 to 15',
              "15-20",
              "17 to 24",
              "20  to 25",
              "25 to 30",
              "30 to 40",
              "45 to 47",
              '50 to 55')
latitude <- c(11.50897246,
              11.48323136,
              11.48719031,
              11.46366611,
              11.41097322,
              11.52111154,
              11.44491386,
              11.46569568)
longitude <- c(76.06032062,
               76.06192685,
               76.04266851,
               76.04156575,
               76.05075092,
               76.02846331,
               76.03084141,
               76.01766216)
school <- data.frame(class, students, latitude, longitude)

# Hex-binned summary of student numbers by location.
# parse_number() keeps the first number of each range (e.g. "15-20" -> 15).
# Longitude goes on x and latitude on y so the plot is oriented like a map.
school %>% mutate(students= parse_number(students)) %>% 
  ggplot(aes(longitude, latitude, z= students))+
  stat_summary_hex()+
  scale_fill_viridis_c(alpha= 0.8)+
  labs(fill='students', title = 'school students')

stat summary

income.data <- data.frame(Village= c(rep('Chittor', 20), 
                                      rep('Bellari', 20)),
                           Income=c(rnorm(n = 20, mean = 1000, sd = 150),
                                    rnorm(n = 20, mean = 1000, sd = 150)))


library(ggplot2)
ggplot(income.data, aes(Village, Income))+
  geom_boxplot()+
  stat_summary(geom = 'point',
               fun= mean,
               col= 'red')

geom_density

# Simulated incomes for two villages (20 draws each, no seed set, so the
# values differ on every run).
income.data <- data.frame(Village= c(rep('Chittor', 20), 
                                      rep('Bellari', 20)),
                           Income=c(rnorm(n = 20, mean = 1000, sd = 150),
                                    rnorm(n = 20, mean = 1000, sd = 150)))

# Per-village means computed from the data actually drawn. The previous
# hard-coded lines at 959 and 1051 presumably came from one particular random
# draw and do not match a fresh sample.
village.means <- aggregate(Income ~ Village, data = income.data, FUN = mean)

library(ggplot2)
ggplot(income.data) +
  # overall mean of all incomes
  geom_vline(aes(xintercept = mean(Income)), linetype = 'dashed')+
  geom_density(aes(x = Income, color = Village)) +
  # one dotted line per village, coloured like its density curve
  geom_vline(data = village.means,
             aes(xintercept = Income, color = Village),
             linetype = 'dotted')

pie chart

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
data <- data.frame(
  category=c("Poaceae",
             'Fabaceae', 
             "Asteraceae", 
             "Acanthaceae", 
             'Rubiaceae', 
             'Euphorbiaceae', 
             'Others'),
  count=c(18,15,8,4,4,3,17 )
)

# Donut chart: a pie trace with a hole in the middle.
# `TRUE` is used instead of `T`, because `T` is an ordinary variable that can
# be reassigned.
fig <- data %>% plot_ly(labels= ~ category, values= ~ count)
fig <- fig %>% add_pie(hole= 0.4) %>% 
  layout(title= "Donut charts using Plotly",  showlegend = TRUE)

fig

annotate

library(tidyverse)
df <- tribble(
  ~gender, ~height,
  'male', 12,
  'male', 8,
  'female',11.5,
  'female',11
)

ggplot(df, aes( gender, height))+
  geom_point()+
  annotate(
    geom = 'text',
    x= 1.29,
    y= 11.4,
    label= 'short person',
    color= 'red',
    size= 3,
    fontface= 'italic'
  )+
  annotate(
    geom = 'segment',
    x= 1.05, # starting point on x, this decides length
    xend = 1.3, # end point on x, this decides length
    y= 11.02, # starting point on y
    yend = 11.3, # ending point on y
    color= 'blue',
    linetype= 'dashed'
    )+
  annotate(
    geom = 'segment',
    x= 1.95, # starting point on x, this decides length
    xend = 1.3, # end point on x, this decides length
    y= 8.2, # starting point on y
    yend = 11.3, # ending point on y
    color= 'blue',
    linetype= 'dashed'
    )

months

library(lubridate)
# month.abb is base R's built-in vector of abbreviated month names
# ("Jan" ... "Dec"), so no index sequence needs to be built first.
months <- month.abb
temperature <- c(10,12,22,32,35,30,33,28,29,25,19,14)
myframe <- data.frame(months,temperature) # one row per month

library(tidyverse)
glimpse(myframe)
## Rows: 12
## Columns: 2
## $ months      <chr> "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "S…
## $ temperature <dbl> 10, 12, 22, 32, 35, 30, 33, 28, 29, 25, 19, 14
library(ggplot2)
ggplot(myframe, aes(x=months, y=temperature, group=1))+
  geom_line(col='blue')+
  geom_point(col='red')+
  ggtitle('Temperature of months')+ 
  scale_x_discrete(limits = month.abb) # this will order months on the x axis

p + scale_x_discrete(limits=c("D0.5", "D2"))
## Warning: Removed 1 rows containing missing values (position_stack).

df2 <- data.frame(supp=rep(c("VC", "OJ"), each=3),
                dose=rep(c("D0.5", "D1", "D2"),2),
                len=c(6.8, 15, 33, 4.2, 10, 29.5))
ggplot(data=df2, aes(x=dose, y=len, fill=supp)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=len), vjust=1.6, color="white",
            position = position_dodge(0.9), size=3.5)+
  scale_fill_brewer(palette="Paired")+
  theme_minimal()

# Stacked barplot with multiple groups
ggplot(data=df2, aes(x=dose, y=len, fill=supp)) +
  geom_bar(stat="identity")

# Use position=position_dodge()
ggplot(data=df2, aes(x=dose, y=len, fill=supp)) +
geom_bar(stat="identity", position=position_dodge())

# Change the colors manually
p <- ggplot(data=df2, aes(x=dose, y=len, fill=supp)) +
geom_bar(stat="identity", color="black", position=position_dodge())+
  theme_minimal()
# Use custom colors
p + scale_fill_manual(values=c('#999999','#E69F00'))

# Use brewer color palettes
p + scale_fill_brewer(palette="Blues")

Color Palettes

#install.packages(c("tidyverse", "gapminder", "MetBrewer"))

libraries

library(tidyverse)
library(gapminder)
# install.packages('MetBrewer')
library(MetBrewer)

Plot the point plot using GDP per Capita as the x- axis and LE as the y axis. Numerical variable Population to control the size of each point.

plot <- gapminder %>% 
  filter (year==2007) %>% 
  ggplot()+
  labs(x= 'GDP per Capita',
             y= 'Life Expectancy',
       color= 'Population in millions',
       size='Population in millions')+
  theme_minimal()

plot+ geom_point(aes(gdpPercap, lifeExp, size= pop/1000000))

To use color in the plot, assign the Population variable to the color aesthetic. Since nothing is specified, ggplot2 chooses a color spectrum for this numerical variable (shades of blue).

plot + geom_point(aes(gdpPercap, lifeExp, size= pop/1000000, color= pop/1000000))

To control the color spectrum, we need to introduce a color scale. In the following plot, we have to provide a vector of hex color values. You would choose this if you got your colors from one of the above-mentioned websites.

plot + geom_point(aes(gdpPercap, lifeExp, size= pop/1000000, color= pop/1000000))+
  scale_color_gradientn(colors = c("#003049", "#D62828", "#F77F00", "#FCBF49", "#EAE2B7"))

To apply one of the MetBrewer palettes, replace the hex-vector with a MetBrewer function. Within the function call, you provide the palette’s name, then the number of colors, and tell it that we need a continuous palette since it is a numerical variable.

plot + geom_point(aes(gdpPercap, lifeExp, size= pop/1000000, color= pop/1000000))+
  scale_color_gradientn(colors = met.brewer('Cross', n=500, type = 'continuous'))

You might also want to use color palettes with non-numerical variables. Let us assume we want to apply color to the Continent variable. This implies using a manual color scale and providing a MetBrewer palette.

plot + geom_point(aes(gdpPercap, lifeExp, size= pop/1000000, color= continent))+
  scale_color_manual(values = met.brewer('Navajo', 5))

Please note: if you want to apply color to the fill aesthetic rather than the color aesthetic, use the scale_fill_manual function instead of scale_color_manual. This is useful for boxplots or bar charts.

# Boxplot of GDP per capita by continent, filled by continent.
# The continuous `color = year` mapping was removed: a boxplot summarises each
# continent into one box, so a per-row colour cannot be drawn.
box <- gapminder %>% 
  filter(gdpPercap< 60000) %>% 
  ggplot(aes(continent, gdpPercap, fill= continent))+
  geom_boxplot()+
  theme_minimal()+
  labs( x= 'Continent', y= 'GDP per Capita', fill= 'Continent')

# Apply a MetBrewer palette to the fill aesthetic (one colour per continent).
box + scale_fill_manual(values = met.brewer('Navajo', 5))

scale fill manual

themes

df <- data.frame(
    Names=as.factor(c('Bacteria', 'Yeast', 'None')),
    Quantity=c(2.5, 5.5, 7.5))

library(ggplot2)
library(tidyverse)
df <- df %>% mutate(Names= fct_relevel(Names, c('Bacteria', 'Yeast', 'None')))

ggplot(df, aes(Names, Quantity, fill= Names))+
    geom_bar(stat = 'identity')+
    scale_fill_manual(values = c('#110a62', '#fcd749','#b5b4b5'))+
  labs(y='Necter pH', x= 'Microbe added to nectar')+
  theme_classic()+
    theme(legend.position = 'none', axis.ticks.x = element_blank())+
  theme(axis.text = element_text(size = 22, color= 'black'))+
theme(axis.line.x = element_blank())+
  theme(axis.ticks = element_line(size = 1, color="black"), 
   axis.ticks.length = unit(.5, "cm"))+
  theme(text = element_text(size = 22))

graphics

dev.new() # open a new graphics window (portable; x11() needs an X11 server on macOS/Linux)
graphics.off() # close all open graphics devices

Normal distribution

Normal distribution, also known as the Gaussian distribution, is a probability distribution that is symmetric about the mean, showing that data near the mean are more frequent in occurrence than data far from the mean.

library(tidyverse)
# Simulation settings. The names deliberately avoid `mean` and `sd`, which
# would mask the base functions (e.g. `fun = mean` elsewhere would then pick
# up the number 170 instead of the function).
n_obs <- 1000
height_mean <- 170 # cm
height_sd <- 6.35 # cm
bin_width <- 0.3
set.seed(1234) # reproducible draws
df <- data.frame(x = rnorm(n_obs, mean = height_mean, sd = height_sd))

# Only `x` is a real aesthetic; the other settings are plain constants and are
# passed directly to the layers that need them.
ggplot(df, aes(x = x))+
  theme_bw()+
  geom_histogram(binwidth = bin_width,
                 colour = "white", fill = "lightblue", size = 0.1)+
  # theoretical density scaled to counts: density * n * binwidth
  stat_function(fun = function(x) {
                  dnorm(x, mean = height_mean, sd = height_sd) * n_obs * bin_width
                },
                color = "darkred", size = 1)

Functions

dice

# Faces of a six-sided die.
dice <- 1:6

# Roll the die once.
# `x` is accepted for backward compatibility but is not used.
# Returns a single integer drawn from `dice`.
myluck <- function(x) {
  sample(dice, size = 1, replace = TRUE)
}

myluck()
## [1] 2

pick a name

# Candidate names to draw from.
names <- c('saneesh','appu','sanusha')

# Pick one name at random.
# `x` is accepted for backward compatibility but is not used.
# Returns a length-1 character vector taken from `names`.
who <- function(x) {
  sample(names, size = 1, replace = TRUE)
}

who()
## [1] "saneesh"

function to split

df <- data.frame(name=as.factor(c('James Bond', 'Spider Man', 'Iron Man')))
# df <- df %>% separate(name, c('Genus', 'Species'), sep = '([ ])')

# Split the `name` column of `df` at the first space into `Genus` and `Species`.
# Prints the resulting data frame and returns it invisibly (the value of print()).
#   extra = "merge": names with more than two words keep the remaining words
#                    in `Species` instead of being truncated with a warning.
#   fill = "right":  one-word names get NA in `Species` instead of a warning.
shorten <- function(df){
  name_split <- df %>%
    separate(name, c('Genus', 'Species'), sep = '([ ])',
             extra = "merge", fill = "right")
  print(name_split)
}

shorten(df)
##    Genus Species
## 1  James    Bond
## 2 Spider     Man
## 3   Iron     Man

Rmarkdown

knitr global options

to apply to every chunk in the file

inside the chunk write knitr::opts_chunk$set(include= ,echo = , message= , warning= )

# knitr::opts_chunk$set(message = TRUE, echo = TRUE, warning = TRUE)

include: to show or hide code and results from appearing
echo: to show or hide code in the output but shows result
message to hide or show the messages generated by the code
warning: to show or hide warning generated by the code

these options can be written for individual chunks as well

## [1] 5

Headings

# Heading 1
## Heading 2 ### Heading 3

italics
italic

bold
bold

plot() to show r code/function
@Saneesh

blockquotes are written after >

this is a blockquote
— Saneesh

plain code

hello

unordered items

  • item 1
  • item 2
    • sub item 1a
    • sub item 2b

ordered items

  1. Item 1
  2. Item 2
    • Item 2a # give two spaces before the +
    • Item 2b

writing mathematical functions

table

using knitr::kable()

Sepal.Length Sepal.Width Petal.Length Petal.Width Species
6.3 3.3 6.0 2.5 virginica
6.3 2.9 5.6 1.8 virginica
6.3 2.7 4.9 1.8 virginica
6.3 2.8 5.1 1.5 virginica
6.3 3.4 5.6 2.4 virginica
6.3 2.5 5.0 1.9 virginica

Resources

colorhunt colors colorpaletts colorpaletts coloradobe mycolor colormind